def create_download_link(df=pd.DataFrame, title = "Click to Download", filename=str, level='GP'):
if level=='Village':
csv = df.to_csv(index=True,index_label=['District','Janpad','Gram Panchayat','Village'])
elif level=='GP':
csv = df.to_csv(index=True,index_label=['District','Janpad','Gram Panchayat'])
elif level=='Janpad':
csv = df.to_csv(index=True,index_label=['District','Janpad'])
elif level=='District':
csv = df.to_csv(index=True,index_label=['District'])
else:
csv = df.to_csv(index=True,index_label=['District','Janpad','Gram Panchayat'])
b64 = base64.b64encode(csv.encode())
payload = b64.decode()
html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
###html = '<input type="button" value="Download" download="{filename}" onclick="data:text/csv;base64,{payload}" target="_blank">{title}</input>'
html = html.format(payload=payload,title=title,filename=filename)
return HTML(html)
True - Yes
False - No
raw_data.empty
Based on the combination - District+Janpad+Gram Panchayat+Village+Name of head of household+GOI ID
raw_data["is_duplicate_record"] = raw_data.duplicated(subset=("District",
"Janpad",
"Gram Panchayat",
"Village",
"Name of head of household",
"GOI ID"),
keep='last')
Deduplicating the data
Number of unique Records -
a. All the values in Column_1 are missing. b. There is no other missing value error.
"""Data Error Condition 3: Number of Records where toilet is available but type of toilet has been mentioned उपलब्ध नहीं है"""
total_hh=np.sum(raw_data['Total Household'])
raw_data_tlt_nttype = raw_data[(raw_data["Has Toilet"] == 'हाँ') & (raw_data["Type of toilet"] == 'उपलब्ध नहीं है')]
x=raw_data_tlt_nttype.shape[0]
y=round(x/total_hh*100,2)
print("No of Records:",x," (",y,"%)")
"""Data Error Condition 4: Number of Records where toilet is not available but type of toilet has been mentioned other than उपलब्ध नहीं है"""
raw_data_ntlt_ttype=raw_data[(raw_data["Has Toilet"] == 'नहीं') & (raw_data["Type of toilet"] != 'उपलब्ध नहीं है')]
x=raw_data_ntlt_ttype.shape[0]
y=round(x/total_hh*100,2)
print("No of Records:",x," (",y,"%)")
"""Data Error Condition 3: Number of Records where water source is available but type of source has been mentioned as not available"""
raw_data_ws_nwstype=raw_data[(raw_data["Has Water Source"] == 'हाँ') & ((raw_data["Water source"] == 'उपलब्ध नहीं है') | (raw_data["Water source"] == 'पानी का कोई स्त्रोत नहीं'))]
x=raw_data_ws_nwstype.shape[0]
y=round(x/total_hh*100,2)
print("No of Records:",x," (",y,"%)")
"""Data Error Condition 3: Number of Records where water source is not available but type of source has been mentioned as available"""
raw_data_nws_wstype=raw_data[(raw_data["Has Water Source"] == 'नहीं') & (raw_data["Water source"] != 'उपलब्ध नहीं है') & (raw_data["Water source"] != 'पानी का कोई स्त्रोत नहीं')]
x=raw_data_nws_wstype.shape[0]
y=round(x/total_hh*100,2)
print("No of Records:",x," (",y,"%)")
print("Number of households having different types toilet")
print("--------------------------------------------------")
hh_hvng_tlt_type_cnt = hh_with_toilet["Type of toilet"].apply(lambda x: "Single Pit" if x=="एक गड्ढे वाला" else ("Double Pit" if x=="दो गड्ढे वाला" else ("Septic Tank" if x=="सेप्टिक टैंक" else ("Septic Tank with Soak Pit" if x=="सोखते गड्ढे के साथ सेप्टिक टैंक" else ("Bio Toilet" if x== "बायो-शौचालय" else "Not Available"))))).value_counts()
print(hh_hvng_tlt_type_cnt)
print("--------------------------------------------------")
display_donut(hh_with_toilet["Type of toilet"].apply(lambda x: "Single Pit" if x=="एक गड्ढे वाला" else ("Double Pit" if x=="दो गड्ढे वाला" else ("Septic Tank" if x=="सेप्टिक टैंक" else ("Septic Tank with Soak Pit" if x=="सोखते गड्ढे के साथ सेप्टिक टैंक" else ("Bio Toilet" if x== "बायो-शौचालय" else "Not Available"))))),
width=5,
height=5,
title="% of Households having different Types of Toilets",
pct=True)
print("Number of households having water source")
print("-----------------------")
display(raw_data["Has Water Source"].apply(lambda x: "Yes" if x=="हाँ" else "No").value_counts())
display_donut(raw_data["Has Water Source"].apply(lambda x: "Yes" if x=="हाँ" else "No"),
width=5,
height=5,
title="Availability of Water Source in Households",
pct=True)
Availability of water source in households across different economic status
Availability of water source in households across different social strata
Number of duplicate records region wise
dup_rec=raw_data_dup.pivot_table(index=["District","Janpad","Gram Panchayat"],
values='Total Household',
aggfunc=np.sum).sort_values('Total Household', ascending=False)
create_download_link(dup_rec,filename="Household Verification - Duplicate Surveys by region.csv")
Status of toilets across State and Districts
hh_tnu_tlt_status = raw_data.pivot_table(index=['District','Janpad','Gram Panchayat'],
values=['Total Household',
'% Headed by Women',
'% Headed by PWD',
'% Having No Toilet',
'% Having No Water Source',
'% Households having Toilet but no Water Source'],
aggfunc={'Total Household':np.sum,
'% Headed by Women':np.mean,
'% Headed by PWD':np.mean,
'% Having No Toilet':np.mean,
'% Having No Water Source':np.mean,
'% Households having Toilet but no Water Source':np.mean},
margins=True).apply(lambda x: round(x,2))
create_download_link(hh_tnu_tlt_status, filename="Household Verification - Status of Toilets in the region.csv")
Number of Households across different socio-economic categories
hh_by_scio_eco=raw_data.pivot_table(index=["District","Janpad", "Gram Panchayat"],
columns=["Economic status", "Sub-category"],
values=['Total Household'],
aggfunc={'Total Household':np.sum},
margins=True)
create_download_link(hh_by_scio_eco,filename="Household Verification - Distribution of Households by Socio Economic Status.csv")
List of Households Not Having Toilet
hh_no_tlt=raw_data[raw_data['Has Toilet']!="हाँ"].pivot_table(index=["District",
"Janpad",
"Gram Panchayat",
"Village"],
values='Total Household',
aggfunc=np.sum).sort_values('Total Household', ascending=False)
create_download_link(hh_no_tlt,filename="Household Verification - Households not having toilet.csv",level="Village")
Number of records where toilet is available in the household but type of toilet has either not been provided or mentioned as not available
raw_data_tlt_nttype_pv=raw_data_tlt_nttype.pivot_table(index=["District","Janpad","Gram Panchayat"],
values='Total Household',
aggfunc=np.sum).sort_values('Total Household', ascending=False)
create_download_link(raw_data_tlt_nttype_pv,filename="Household Verification Error - Missing/Incorect Type of Toilet.csv")
Number of records where toilet is not available in the household but type of toilet has been mentioned
raw_data_ntlt_ttype_pv=raw_data_ntlt_ttype.pivot_table(index=["District","Janpad","Gram Panchayat"],
values='Total Household',
aggfunc=np.sum).sort_values('Total Household', ascending=False)
create_download_link(raw_data_ntlt_ttype_pv,filename="Household Verification Error - No Toilet but Type Mentioned.csv")
Number of records where water source is available in the household but type of source has either not been provided or mentioned as not available
raw_data_ws_nwstype_pv=raw_data_ws_nwstype.pivot_table(index=["District","Janpad","Gram Panchayat"],
values='Total Household',
aggfunc=np.sum).sort_values('Total Household', ascending=False)
create_download_link(raw_data_ws_nwstype_pv,filename="Household Verification Error - Water Source Missing.csv")
Number of records where water source is not available in the household but type of source has been provided
raw_data_nws_wstype_pv=raw_data_nws_wstype.pivot_table(index=["District","Janpad","Gram Panchayat"],
values='Total Household',
aggfunc=np.sum).sort_values('Total Household', ascending=False)
create_download_link(raw_data_nws_wstype_pv,filename="Household Verification Error - No Water but Source Provided.csv")